In [1]:
import sys
In [2]:
sys.path.append('/mnt/home/ubuntu/projects/tools/')
In [3]:
import sys,re,json,os,csv,glob
import numpy as np
import matplotlib.pyplot as plt
from dateutil.parser import parse
import datetime,time,random,traceback
from geopy import distance
import geolocator
geo=geolocator.Geolocator()
geo.init()
In [4]:
files=glob.glob('../data/2014-*/*json')
files.sort()
print len(files)
print files[0]
In [5]:
tweets=[]
for file in files:
# Cycle through files
fileString=open(file,'r').read().decode('utf-8')
# Read file as one long string and convert to uniicode
fileDocs=[json.loads(line) for line in fileString.split('\n')]
fileDocs=[d for d in fileDocs if d['interaction']['tag_tree']['topic'].keys()[0] in ['Discrimination', 'Prevention']]
fileTweets=[t for t in fileDocs if t['interaction']['type'] in ['twitter']]
# Split into lines and load as JSON
tweets.extend(fileTweets)
# Add list of tweets from file to global list
print len(tweets)
In [4]:
nTime=0
nId=0
nCity=0
# For counting errors
cities=['Belo Horizonte', u'Brasília, Brasilia', u'Cuiabá', 'Curitiba', 'Fortaleza', 'Manaus', 'Natal, Rio Grande do Norte',
'Porto Alegre', 'Recife', 'Rio de Janeiro', 'Salvador, Bahia', u'São Paulo', 'Rio Branco, Acre', u'Maceió', u'Macapá',
u'Vitória, Espírito Santo', u'Goiânia', u'São Luís, Maranhão', 'Campo Grande, Mato Grosso do Sul', u'Belém, Pará',
u'João Pessoa, Paraíba', u'Teresina, Piauí', u'Porto Velho, Rondônia', 'Boa Vista, Roraima', u'Florianópolis',
'Aracaju, Sergipe', 'Palmas, Tocantins']
# Define cities to 'snap' coords to
coords=[]
coords=[geo.geoLocate(c)[0][1:3] for c in cities]
# Get coords from geolocator
tolerance=120
# Set tolerance to snap locations to nearest cities, in KM
In [6]:
outFile=csv.writer(open('cities.csv','w'),delimiter='\t')
for i,j in zip(cities,coords):
print i,j
In [7]:
print tweets[11]
In [8]:
def getClosestCity(tCoords):
'''Takes tuple of coordinates, cycles through cities
in global variable <cities>, reads their coords from
global variable <coords> and returns closest
------
returns tuple of coords of closest city,city name
OR None, if no city within tolerance'''
dist=999999
closest='ZZZZ'
cCoords=[]
for c,cc in enumerate(cities):
cDist=distance.distance(tCoords,coords[c])
if cDist<dist:
dist=cDist
closest=cc
cCoords=coords[c]
if dist<tolerance:
return cCoords,closest
else:
return None
In [9]:
import gender
g=gender.Gender()
g.gender(tweets[1]['interaction']['author']['name'])
Out[9]:
In [10]:
def mungeDate(dummyTime):
'''Takes Twitter timestamp
------
returns iso format timestamp -> YYY-MM-DD hh:mm:ss
'''
# Get from this format: Thu, 02 Jan 2014 16:26:15 +0000...
timeStruct=datetime.datetime.strptime(dummyTime,'%a, %d %b %Y %H:%M:%S +0000')
# Gets list with date/time components
return timeStruct
# ...into this format mm/DD/YYYYYYY-MM-DD hh:mm:ss
In [11]:
print coords
print coords[cities.index(u'São Paulo')]
getClosestCity(coords[cities.index(u'São Paulo')])
Out[11]:
In [12]:
outFile=csv.writer(open('../data/all.csv','w'))
# Open output file
nTime=nId=nCity=nRange=nCategory=nSubCategory=nTopic=0
# Reset error counters
outFile.writerow(['city','lat','lon','origdate','topic'])
for t,tweet in enumerate(tweets):
cityCoords=None
try:
tTime=tweet['interaction']['created_at']
except:
nTime+=1
try:
id=tweet['interaction']['id']
except:
nId+=1
try:
category=tweet['interaction']['tag_tree']['topic'].keys()[0]
except:
nCategory+=1
try:
subCategory=tweet['interaction']['tag_tree']['topic'].values()[0][0]
except:
nSubCategory+=1
try:
topic = category + "_" + subCategory
except:
nTopic+=1
if 'geo' in tweet['twitter'].keys():
res=getClosestCity([tweet['twitter']['geo']['latitude'],tweet['twitter']['geo']['longitude']])
if res:
# If location doesn't snap to chosen cities, within tolerance, then throw away
(cityCoords,city)=res
outFile.writerow([city.partition(',')[0].encode("utf-8"),cityCoords[0],cityCoords[1],mungeDate(tTime),topic])
else:
nRange+=1
else:
nCity+=1
# print tweet
# print 'FAILING...'
# print tweet.keys()
# sys.exit(1)
# All these tweets should have lat/long, if not stop and find out why
print nTime,nId,nCity,nRange,nCategory,nSubCategory,nTopic
In [13]:
!head ../data/all.csv
!wc ../data/all.csv
In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)
Out[1]:
In [14]: